library(alakazam)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(seriation)
library(ggpubr)
library(divo)
library(ggsci)
library(poweRlaw)
library(ggbeeswarm)
library(ggthemes)
library(wesanderson)
library(Bolstad2)
metadata <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data_376276/metadata_v2.txt')
Rearrangement_all <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/rearrangement_details_kit_376276/RearrangementDetails_06-25-2019_12-33-29_AM.tsv')
Rearrangement_all <- Rearrangement_all %>% dplyr::select(sample_name, amino_acid, templates, productive_frequency) %>% dplyr::rename('sample_id'=sample_name, 'clone_id'=amino_acid, 'seq_count'=templates, 'seq_freq'=productive_frequency)
kit1 <- merge(metadata, Rearrangement_all, by.x = 'Sample', by.y = 'sample_id')
#kit1 <- kit1 %>% dplyr::select(!group_code)

#metadata <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data_12511620/metadata.txt')
Rearrangement_all <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/rearrangement_details_kit_12511620/RearrangementDetails_07-20-2020_10-27-21_AM.tsv')
Rearrangement_all <- Rearrangement_all %>% dplyr::select(sample_name, amino_acid, templates, productive_frequency) %>% dplyr::rename('sample_id'=sample_name, 'clone_id'=amino_acid, 'seq_count'=templates, 'seq_freq'=productive_frequency)
kit2 <- merge(metadata, Rearrangement_all, by.x = 'Sample', by.y = 'sample_id')
#kit2 <- kit2 %>% dplyr::select(!group_code)
Rearrangement_all <- bind_rows(kit1, kit2)
head(Rearrangement_all)
##     Sample ID     COMET_ID COMET_ID_nr    group group_code Kit_ID
## 1 LivMet_1  1 COMET-0026M1          26 no_chemo          0 376276
## 2 LivMet_1  1 COMET-0026M1          26 no_chemo          0 376276
## 3 LivMet_1  1 COMET-0026M1          26 no_chemo          0 376276
## 4 LivMet_1  1 COMET-0026M1          26 no_chemo          0 376276
## 5 LivMet_1  1 COMET-0026M1          26 no_chemo          0 376276
## 6 LivMet_1  1 COMET-0026M1          26 no_chemo          0 376276
##             clone_id seq_count     seq_freq
## 1 CASSLVVGRFTMNTEAFF         1 3.064758e-05
## 2    CSAEPGEHGNQPQHF         1 3.064758e-05
## 3    CASSPDGISNQPQHF         1 3.064758e-05
## 4    CASRLRLYSNQPQHF         1 3.064758e-05
## 5  CASSKRHHLVANTEAFF         1 3.064758e-05
## 6     CASSHPRDSPGYTF         1 3.064758e-05

Make plot for T cell fraction

a <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/metadata/SampleOverview_Kit_376276.tsv')
a <- a[, c('sample_name', 'ng/uL')]
b <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/metadata/Overview_samples_kit_2_v2.tsv')
b <- b[, c('Sample_ID', 'ng_ul_DNA')]
colnames(b) <- c('sample_name', 'ng/uL')
c <- bind_rows(a, b)
c['uL'] = 32
c['ng'] = c['ng/uL'] * c['uL']
c['ng_pr_cell'] = 0.0066
c['total_genomes'] = c['ng'] / c['ng_pr_cell']

SampleOverview = read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/SampleOverview_10-11-2021_1-47-39_PM.tsv')
SampleOverview = SampleOverview %>% left_join(c, how='right', on='sample_name')
SampleOverview['tcell_fraction'] = SampleOverview['productive_templates'] / SampleOverview['total_genomes']
meta <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/metadata/metadata_all_v2.txt')
SampleOverview <- meta %>% left_join(SampleOverview, how='right', by=c('subject_id' = 'sample_name'))
SampleOverview <- SampleOverview %>% mutate('NACT-group' = case_when(group=='no_chemo'~'No-NACT',
                                                   group=='short_chemo'~'Short-interval',
                                                   group=='long_chemo'~'Long-interval'))
SampleOverview$`NACT-group` <- factor(SampleOverview$`NACT-group`, levels=c('No-NACT', 'Short-interval', 'Long-interval'))
same_patient <- c(
  'LivMet_38', 'LivMet_11', 'LivMet_14', 'LivMet_17',
  'LivMet_40', 'LivMet_42', 'LivMet_33'
) 
SampleOverview <- SampleOverview %>% filter(!subject_id %in% same_patient)
SampleOverview
## # A tibble: 85 × 28
##    filename  subject_id    ID COMET_ID COMET_ID_nr group group_code Kit_ID chemo
##    <chr>     <chr>      <dbl> <chr>          <dbl> <chr>      <dbl>  <dbl> <chr>
##  1 LivMet_1… LivMet_1       1 COMET-0…          26 no_c…          0 376276 no   
##  2 LivMet_1… LivMet_10     10 COMET-0…          24 no_c…          0 376276 no   
##  3 LivMet_1… LivMet_12     12 COMET-0…          27 long…          2 376276 yes  
##  4 LivMet_1… LivMet_13     13 COMET-0…          30 shor…          1 376276 yes  
##  5 LivMet_1… LivMet_15     15 COMET-0…          31 shor…          1 376276 yes  
##  6 LivMet_1… LivMet_16     16 COMET-0…          32 no_c…          0 376276 no   
##  7 LivMet_1… LivMet_18     18 COMET-0…          34 no_c…          0 376276 no   
##  8 LivMet_1… LivMet_19     19 COMET-0…          38 shor…          1 376276 yes  
##  9 LivMet_2… LivMet_2       2 COMET-0…          35 no_c…          0 376276 no   
## 10 LivMet_2… LivMet_20     20 COMET-0…          40 no_c…          0 376276 no   
## # … with 75 more rows, and 19 more variables: cluster <dbl>,
## #   total_templates <dbl>, productive_templates <dbl>,
## #   fraction_productive <dbl>, total_rearrangements <dbl>,
## #   productive_rearrangements <dbl>, productive_simpson_clonality <dbl>,
## #   max_productive_frequency <dbl>, locus <chr>, sample_tags <lgl>, sku <chr>,
## #   test_name <chr>, ng/uL <dbl>, uL <dbl>, ng <dbl>, ng_pr_cell <dbl>,
## #   total_genomes <dbl>, tcell_fraction <dbl>, NACT-group <fct>
SampleOverview %>% filter(group == 'no_chemo')
## # A tibble: 35 × 28
##    filename  subject_id    ID COMET_ID COMET_ID_nr group group_code Kit_ID chemo
##    <chr>     <chr>      <dbl> <chr>          <dbl> <chr>      <dbl>  <dbl> <chr>
##  1 LivMet_1… LivMet_1       1 COMET-0…          26 no_c…          0 376276 no   
##  2 LivMet_1… LivMet_10     10 COMET-0…          24 no_c…          0 376276 no   
##  3 LivMet_1… LivMet_16     16 COMET-0…          32 no_c…          0 376276 no   
##  4 LivMet_1… LivMet_18     18 COMET-0…          34 no_c…          0 376276 no   
##  5 LivMet_2… LivMet_2       2 COMET-0…          35 no_c…          0 376276 no   
##  6 LivMet_2… LivMet_20     20 COMET-0…          40 no_c…          0 376276 no   
##  7 LivMet_2… LivMet_22     22 COMET-0…          42 no_c…          0 376276 no   
##  8 LivMet_2… LivMet_25     25 COMET-0…          49 no_c…          0 376276 no   
##  9 LivMet_2… LivMet_26     26 COMET-0…          51 no_c…          0 376276 no   
## 10 LivMet_2… LivMet_27     27 COMET-0…          52 no_c…          0 376276 no   
## # … with 25 more rows, and 19 more variables: cluster <dbl>,
## #   total_templates <dbl>, productive_templates <dbl>,
## #   fraction_productive <dbl>, total_rearrangements <dbl>,
## #   productive_rearrangements <dbl>, productive_simpson_clonality <dbl>,
## #   max_productive_frequency <dbl>, locus <chr>, sample_tags <lgl>, sku <chr>,
## #   test_name <chr>, ng/uL <dbl>, uL <dbl>, ng <dbl>, ng_pr_cell <dbl>,
## #   total_genomes <dbl>, tcell_fraction <dbl>, NACT-group <fct>
SampleOverview %>% filter(group == 'short_chemo')
## # A tibble: 35 × 28
##    filename  subject_id    ID COMET_ID COMET_ID_nr group group_code Kit_ID chemo
##    <chr>     <chr>      <dbl> <chr>          <dbl> <chr>      <dbl>  <dbl> <chr>
##  1 LivMet_1… LivMet_13     13 COMET-0…          30 shor…          1 376276 yes  
##  2 LivMet_1… LivMet_15     15 COMET-0…          31 shor…          1 376276 yes  
##  3 LivMet_1… LivMet_19     19 COMET-0…          38 shor…          1 376276 yes  
##  4 LivMet_2… LivMet_23     23 COMET-0…          43 shor…          1 376276 yes  
##  5 LivMet_2… LivMet_24     24 COMET-0…          46 shor…          1 376276 yes  
##  6 LivMet_3… LivMet_30     30 COMET-0…          55 shor…          1 376276 yes  
##  7 LivMet_3… LivMet_32     32 COMET-0…          62 shor…          1 376276 yes  
##  8 LivMet_3… LivMet_37     37 COMET-0…           5 shor…          1 376276 yes  
##  9 LivMet_4… LivMet_41     41 COMET-0…          37 shor…          1 376276 yes  
## 10 LivMet_4… LivMet_46     46 COMET-0…          83 shor…          1 376276 yes  
## # … with 25 more rows, and 19 more variables: cluster <dbl>,
## #   total_templates <dbl>, productive_templates <dbl>,
## #   fraction_productive <dbl>, total_rearrangements <dbl>,
## #   productive_rearrangements <dbl>, productive_simpson_clonality <dbl>,
## #   max_productive_frequency <dbl>, locus <chr>, sample_tags <lgl>, sku <chr>,
## #   test_name <chr>, ng/uL <dbl>, uL <dbl>, ng <dbl>, ng_pr_cell <dbl>,
## #   total_genomes <dbl>, tcell_fraction <dbl>, NACT-group <fct>
SampleOverview %>% filter(group == 'long_chemo')
## # A tibble: 15 × 28
##    filename  subject_id    ID COMET_ID COMET_ID_nr group group_code Kit_ID chemo
##    <chr>     <chr>      <dbl> <chr>          <dbl> <chr>      <dbl>  <dbl> <chr>
##  1 LivMet_1… LivMet_12     12 COMET-0…          27 long…          2 3.76e5 yes  
##  2 LivMet_4… LivMet_43     43 COMET-0…          74 long…          2 3.76e5 yes  
##  3 LivMet_4… LivMet_44     44 COMET-0…          75 long…          2 3.76e5 yes  
##  4 LivMet_4… LivMet_45     45 COMET-0…          76 long…          2 3.76e5 yes  
##  5 LivMet_6… LivMet_62     62 COMET-0…         105 long…          2 1.25e7 yes  
##  6 LivMet_6… LivMet_68     68 COMET-0…         185 long…          2 1.25e7 yes  
##  7 LivMet_8… LivMet_84     84 COMET-0…          80 long…          2 1.25e7 yes  
##  8 LivMet_8… LivMet_85     85 COMET-0…         131 long…          2 1.25e7 yes  
##  9 LivMet_8… LivMet_86     86 COMET-0…         138 long…          2 1.25e7 yes  
## 10 LivMet_8… LivMet_87     87 COMET-0…         151 long…          2 1.25e7 yes  
## 11 LivMet_8… LivMet_88     88 COMET-0…         182 long…          2 1.25e7 yes  
## 12 LivMet_9… LivMet_90     90 COMET-0…         206 long…          2 1.25e7 yes  
## 13 LivMet_9… LivMet_91     91 COMET-0…         233 long…          2 1.25e7 yes  
## 14 LivMet_9… LivMet_92     92 COMET-0…         242 long…          2 1.25e7 yes  
## 15 LivMet_9… LivMet_93     93 COMET-0…         267 long…          2 1.25e7 yes  
## # … with 19 more variables: cluster <dbl>, total_templates <dbl>,
## #   productive_templates <dbl>, fraction_productive <dbl>,
## #   total_rearrangements <dbl>, productive_rearrangements <dbl>,
## #   productive_simpson_clonality <dbl>, max_productive_frequency <dbl>,
## #   locus <chr>, sample_tags <lgl>, sku <chr>, test_name <chr>, ng/uL <dbl>,
## #   uL <dbl>, ng <dbl>, ng_pr_cell <dbl>, total_genomes <dbl>,
## #   tcell_fraction <dbl>, NACT-group <fct>
SampleOverview %>% filter(group %in% c('short_chemo', 'long_chemo'))
## # A tibble: 50 × 28
##    filename  subject_id    ID COMET_ID COMET_ID_nr group group_code Kit_ID chemo
##    <chr>     <chr>      <dbl> <chr>          <dbl> <chr>      <dbl>  <dbl> <chr>
##  1 LivMet_1… LivMet_12     12 COMET-0…          27 long…          2 376276 yes  
##  2 LivMet_1… LivMet_13     13 COMET-0…          30 shor…          1 376276 yes  
##  3 LivMet_1… LivMet_15     15 COMET-0…          31 shor…          1 376276 yes  
##  4 LivMet_1… LivMet_19     19 COMET-0…          38 shor…          1 376276 yes  
##  5 LivMet_2… LivMet_23     23 COMET-0…          43 shor…          1 376276 yes  
##  6 LivMet_2… LivMet_24     24 COMET-0…          46 shor…          1 376276 yes  
##  7 LivMet_3… LivMet_30     30 COMET-0…          55 shor…          1 376276 yes  
##  8 LivMet_3… LivMet_32     32 COMET-0…          62 shor…          1 376276 yes  
##  9 LivMet_3… LivMet_37     37 COMET-0…           5 shor…          1 376276 yes  
## 10 LivMet_4… LivMet_41     41 COMET-0…          37 shor…          1 376276 yes  
## # … with 40 more rows, and 19 more variables: cluster <dbl>,
## #   total_templates <dbl>, productive_templates <dbl>,
## #   fraction_productive <dbl>, total_rearrangements <dbl>,
## #   productive_rearrangements <dbl>, productive_simpson_clonality <dbl>,
## #   max_productive_frequency <dbl>, locus <chr>, sample_tags <lgl>, sku <chr>,
## #   test_name <chr>, ng/uL <dbl>, uL <dbl>, ng <dbl>, ng_pr_cell <dbl>,
## #   total_genomes <dbl>, tcell_fraction <dbl>, NACT-group <fct>
get_CI_half_width <- function(x, prob) {
  n <- length(x)
  z_t <- qt(1 - (1 - prob) / 2, df = n - 1)
  z_t * sd(x) / sqrt(n)
}

lower <- function(x, prob = 0.95) {
  mean(x) - get_CI_half_width(x, prob)
}

upper <- function(x, prob = 0.95) {
  mean(x) + get_CI_half_width(x, prob)
}

Mean number of productive CDR3Beta templates per sample

templates <- SampleOverview$productive_templates

# mean templates
paste("Mean templates:", round(mean(templates), 0))
## [1] "Mean templates: 83098"
# lower CI
paste("Lower 95% CI:  ", round(lower(templates), 0))
## [1] "Lower 95% CI:   66614"
# upper CI
paste("Upper 95% CI:  ", round(upper(templates), 0))
## [1] "Upper 95% CI:   99582"
# median templates
paste("Median templates:", round(median(templates), 3))
## [1] "Median templates: 63063"

Mean T cell fraction per sample

fraction <- SampleOverview$tcell_fraction

# mean fraction
paste("Mean fraction:", round(mean(fraction), 3))
## [1] "Mean fraction: 0.081"
# lower CI
paste("Lower 95% CI: ", round(lower(fraction), 3))
## [1] "Lower 95% CI:  0.065"
# upper CI
paste("Upper 95% CI: ", round(upper(fraction), 3))
## [1] "Upper 95% CI:  0.096"
# median fraction
paste("Median fraction:", round(median(fraction), 3))
## [1] "Median fraction: 0.057"

Mean T cell fraction per sample short interval

fraction <- SampleOverview %>% filter(group == 'short_chemo') %>% pull(tcell_fraction)

# mean fraction
paste("Mean fraction:", round(mean(fraction), 3))
## [1] "Mean fraction: 0.106"
# lower CI
paste("Lower 95% CI: ", round(lower(fraction), 3))
## [1] "Lower 95% CI:  0.074"
# upper CI
paste("Upper 95% CI: ", round(upper(fraction), 3))
## [1] "Upper 95% CI:  0.138"
# median fraction
paste("Median fraction:", round(median(fraction), 3))
## [1] "Median fraction: 0.076"

Mean T cell fraction per sample no chemo

fraction <- SampleOverview %>% filter(group == 'no_chemo') %>% pull(tcell_fraction)

# mean fraction
paste("Mean fraction:", round(mean(fraction), 3))
## [1] "Mean fraction: 0.063"
# lower CI
paste("Lower 95% CI: ", round(lower(fraction), 3))
## [1] "Lower 95% CI:  0.048"
# upper CI
paste("Upper 95% CI: ", round(upper(fraction), 3))
## [1] "Upper 95% CI:  0.079"
# median fraction
paste("Median fraction:", round(median(fraction), 3))
## [1] "Median fraction: 0.057"

Mean T cell fraction per sample long chemo

fraction <- SampleOverview %>% filter(group == 'long_chemo') %>% pull(tcell_fraction)

# mean fraction
print(paste("Mean fraction:", round(mean(fraction), 3)))
## [1] "Mean fraction: 0.062"
# lower CI
print(paste("Lower 95% CI: ", round(lower(fraction), 3)))
## [1] "Lower 95% CI:  0.034"
# upper CI
print(paste("Upper 95% CI: ", round(upper(fraction), 3)))
## [1] "Upper 95% CI:  0.089"
# median fraction
paste("Median fraction:", round(median(fraction), 3))
## [1] "Median fraction: 0.048"
            theme(axis.text.x = element_text(angle=60, hjust=1)
my_comparisons <- list(c('No-NACT', 'Short-interval'), 
                       c('No-NACT', 'Long-interval'),
                       c('Short-interval', 'Long-interval'))

p <- ggviolin(SampleOverview, x='NACT-group', y='tcell_fraction',
               color='NACT-group', palette='jco',
               add='dotplot', scale='count')


p <- p + stat_compare_means(comparisons = my_comparisons, method='t.test',label.y=c(.45,.75, .6)) +
  ylab("T cell fraction") + theme_minimal() + 
  theme(legend.position='none', axis.text.x = element_text(angle=60, hjust=1))


ggsave(filename = '/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/tcr_seq_t_fraction.pdf',
       plot = p, width=4, height=5)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
p
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Total unique clones

comb_rear_all <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/CombinedRearrangements_ALL_COMET.tsv')
print(paste('Total unique clones:', comb_rear_all %>% pull(`Amino Acid`) %>% length()))
## [1] "Total unique clones: 1413435"

Median (min-max) unique clones

print(paste("Median unique clones:",SampleOverview$productive_rearrangements %>% median()))
## [1] "Median unique clones: 15596"
print(paste("Min unique clones:",SampleOverview$productive_rearrangements %>% min()))
## [1] "Min unique clones: 1476"
print(paste("Max unique clones:",SampleOverview$productive_rearrangements %>% max()))
## [1] "Max unique clones: 66976"

Metadata

metadata <- bind_rows(read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data_376276/metadata_v2.txt') %>% dplyr::select(!group_code),
          read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data_12511620/metadata_v2.txt') %>% dplyr::select(!group_code))
kit_1_qc <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/qc_reports/qcReport_kit_1.tsv')
kit_2_qc <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/qc_reports/qcReport_kit_2.tsv')

qc_report <- bind_rows(kit_1_qc, kit_2_qc)
## Warning: One or more parsing issues, see `problems()` for details

## Warning: One or more parsing issues, see `problems()` for details
coverage <- qc_report %>% select(`Sample Name`, Coverage)
colnames(coverage) <- c("sample_id", "Coverage")
coverage
## # A tibble: 95 × 2
##    sample_id Coverage
##    <chr>        <dbl>
##  1 LivMet_1      26  
##  2 LivMet_10     10.8
##  3 LivMet_11     28.7
##  4 LivMet_12      5.1
##  5 LivMet_13      3.8
##  6 LivMet_14      4.7
##  7 LivMet_15     15.8
##  8 LivMet_16     30.6
##  9 LivMet_17     19.6
## 10 LivMet_18     37.4
## # … with 85 more rows

Read hill diversity and evenness profiles

# Set path to hill diversity profiles
path <- '/Users/eirikhoy/dropbox/projects/airr_tools/results/comet/hill_div/'
list_of_files <- list.files(path = path,
                            recursive = TRUE,
                            pattern = "\\.tsv$",
                            full.names = TRUE)

df <- readr::read_tsv(list_of_files)
df <- df %>% left_join(metadata[, c("Sample", "COMET_ID")], by=c("sample_id"="Sample")) %>%
  left_join(coverage)



### new analysis, exclude coverage < 5 and patient duplicates

duplicate_samples <- c('LivMet_38', 'LivMet_11', 'LivMet_14', 'LivMet_17',
                       'LivMet_40', 'LivMet_42', 'LivMet_33')


df <- df %>% filter(!sample_id %in% duplicate_samples) %>%
  filter(COMET_ID != 'COMET-0041M1') %>% # remove because NACT-interval data missing
  filter(Coverage >= 5)

nr_samples <- df$sample_id %>% unique() %>% length()

print(paste("Number of samples after selecting for coverage >= 5:", nr_samples))
## [1] "Number of samples after selecting for coverage >= 5: 77"
annotation <- metadata %>% 
  filter(group != 'na') %>%
  dplyr::select(COMET_ID, group) %>%
  as.data.frame()
rownames(annotation) <- annotation$COMET_ID
annotation$COMET_ID <- NULL

diversity_profile <- df %>% 
  dplyr::select(COMET_ID, q, e) %>%
  pivot_wider(names_from = COMET_ID, values_from = e) %>%
  as.data.frame()

rownames(diversity_profile) <- diversity_profile$q
diversity_profile$q <- NULL
diversity_profile <- as.matrix(diversity_profile)

nr_categories <- length(unique(annotation$group))
ann_colors <- list(group = c(no_chemo='#0073C2FF', short_chemo="#EFC000FF", long_chemo="#3B3B3BFF"))

names(ann_colors$group) <- unique(unique(annotation$group))

quantile_breaks <- function(xs, n = 10) {
  breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
  breaks[!duplicated(breaks)]
}

cl_cb <- function(hcl, mat){
    # Recalculate manhattan distances for reorder method
    dists <- dist(mat, method = "manhattan")

    # Perform reordering according to OLO method
    hclust_olo <- reorder(hcl, dists)
    return(hclust_olo)
}
diversity_profile_breaks <- quantile_breaks(diversity_profile, n = 10)

pheatmap(diversity_profile, 
         annotation = annotation, 
         cluster_rows = TRUE,
         annotation_colors = ann_colors, 
         border_color = NA,
         main = paste('Evenness Profile\n', "n =", nr_samples), 
         fontsize = 7.5, 
         color=inferno(length(diversity_profile_breaks) - 1),
         breaks = diversity_profile_breaks,
         clustering_callback = cl_cb

         )

diversity_profile_breaks <- quantile_breaks(cor(diversity_profile), n = 10)

pheatmap(cor(diversity_profile), 
         annotation_col = annotation, 
         annotation_row = annotation, 
         annotation_colors = ann_colors, 
         clustering_distance_cols = "manhattan",
         clustering_distance_rows = "manhattan",
         border_color = NA,
         main = paste('Correlation Map Evenness Profile\n', 'n=', nr_samples), 
         fontsize = 7.5, 
         color = inferno(10),
         #show_rownames = FALSE,
         #show_colnames = FALSE,
         
         #color = inferno(length(diversity_profile_breaks) - 1),
         #breaks = diversity_profile_breaks
         )

# reorder clusters according to OLO method

out <- pheatmap(cor(diversity_profile), 
         annotation_col = annotation, 
         annotation_row = annotation, 
         #cluster_rows = TRUE,
         
         annotation_colors = ann_colors, 
         clustering_distance_cols = "manhattan",
         clustering_distance_rows = "manhattan",
         main = paste('Correlation Map Evenness Profile\n', 'n=', nr_samples), 
         fontsize = 7.5, 
         #color = inferno(length(diversity_profile_breaks) - 1),
         #breaks = diversity_profile_breaks
         clustering_callback = cl_cb,
#         legend=FALSE,
#         annotation_legend=FALSE,
         #show_rownames = FALSE,
         #show_colnames = FALSE,
         border_color = NA,
         color = inferno(10)
         #cutree_rows = 6,
         #cutree_cols = 6
         )

dendrogram with cutoff at h=2.5

height = 2.5

plot(out$tree_col)
abline(h=height, col='red', lty=2, lwd=2)

#Cut the row (gene) dendrogram at a Euclidean distance dis-similarity of 8
d = data.frame(sort(cutree(out$tree_col, h=height)))
d = tibble(
  'COMET_ID' = rownames(d),
  'cluster' = as.character(d[,1])
)
t <- d %>% merge(metadata, by.x='COMET_ID', by.y='COMET_ID') %>% dplyr::select(group, cluster)# %>% filter(cluster != 6)
write_tsv(d %>% left_join(metadata %>% dplyr::select(c(Sample, COMET_ID)), by='COMET_ID'), file='/Users/eirikhoy/dropbox/Manuscript_ImmunoSEQ_COMET/supplementary_files/results/hill_even_clust_dist_3.tsv')
#t$group[t$group == 'long_chemo'] <- 'yes'
#t$group[t$group == 'short_chemo'] <- 'yes'
#t$group[t$group == 'no_chemo'] <- 'no'

Fisher’s Exact Test for Count Data

t <- table(t)
print(t)
##              cluster
## group          1  2  3  4  5  6
##   long_chemo   6  6  0  1  2  0
##   no_chemo    10  7  6  8  0  1
##   short_chemo 11 11  1  1  5  1
#chisq <- chisq.test(t, simulate.p.value = FALSE, B=10000)
#print(chisq)
fish <- fisher.test(t)
print(fish)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  t
## p-value = 0.02132
## alternative hypothesis: two.sided
get_CI_half_width <- function(x, prob) {
  n <- length(x)
  z_t <- qt(1 - (1 - prob) / 2, df = n - 1)
  z_t * sd(x) / sqrt(n)
}

lower <- function(x, prob = 0.95) {
  mean(x) - get_CI_half_width(x, prob)
}

upper <- function(x, prob = 0.95) {
  mean(x) + get_CI_half_width(x, prob)
}

no_chemo_ <- df %>% 
  merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% 
  as_tibble() %>% filter(group == 'no_chemo') %>%
  group_by(q) %>% summarize_at(vars(c('d','e')), funs(mean, sd, min, max, lower, upper)) %>%
  mutate(group='no_chemo')
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
short_chemo_ <- df %>% 
  merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% 
  as_tibble() %>% filter(group == 'short_chemo') %>%
  group_by(q) %>% summarize_at(vars(c('d','e')), funs(mean, sd, min, max, lower, upper)) %>%
  mutate(group='short_chemo')

long_chemo_ <- df %>% 
  merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% 
  as_tibble() %>% filter(group == 'long_chemo') %>%
  group_by(q) %>% summarize_at(vars(c('d','e')), funs(mean, sd, min, max, lower, upper)) %>%
  mutate(group='long_chemo')

df_hill <- bind_rows(bind_rows(no_chemo_, short_chemo_), long_chemo_) %>%
  mutate(group = factor(group, levels=c("no_chemo", "short_chemo", "long_chemo")))
p1 <- df_hill %>%
  mutate(`NACT-group` = case_when(
    group == 'no_chemo' ~ 'No-NACT',
    group == 'short_chemo' ~ 'Short-interval',
    group == 'long_chemo' ~ 'Long-interval'
  )) %>%
  mutate(`NACT-group` = factor(`NACT-group`, levels = c('No-NACT', 'Short-interval', 'Long-interval'))) %>%
  ggplot(aes(x=q, y=d_mean, group=`NACT-group`, color=`NACT-group`)) +
  geom_line() +
  geom_ribbon(aes(ymin = d_lower, ymax = d_upper, fill=`NACT-group`), alpha = 0.1, color=NA) +
  theme_classic() +
  xlab(expression(alpha)) + ylab('Hill diversity') + 
  theme(axis.title.x = element_text(face='italic')) +
  scale_x_continuous(breaks=c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + theme(legend.title = element_blank())

p1 <- set_palette(p1, 'jco')
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/hill_div_prof.pdf',
       plot=p1,
       width=6, height=4)


p1

Species richness all NACT groups

species_richness_all <- df %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% filter(q == 0.0) %>% pull(d)
print(paste("N =", length(species_richness_all)))
## [1] "N = 77"
print(paste("Mean species richness:", round(mean(species_richness_all),0)))
## [1] "Mean species richness: 16000"
print(paste("Lower CI species richness:", round(lower(species_richness_all),0)))
## [1] "Lower CI species richness: 13611"
print(paste("Upper CI species richness:", round(upper(species_richness_all),0)))
## [1] "Upper CI species richness: 18388"

Species richness short-interval NACT group

species_richness_short_chemo <- df %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% filter(group == 'short_chemo') %>% filter(q == 0.0) %>% pull(d)
print(paste("N =", length(species_richness_short_chemo)))
## [1] "N = 30"
print(paste("Mean species richness:", round(mean(species_richness_short_chemo),0)))
## [1] "Mean species richness: 18020"
print(paste("Lower CI species richness:", round(lower(species_richness_short_chemo),0)))
## [1] "Lower CI species richness: 13524"
print(paste("Upper CI species richness:", round(upper(species_richness_short_chemo),0)))
## [1] "Upper CI species richness: 22515"

Species richness long-interval NACT group

species_richness_long_chemo <- df %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% filter(group == 'long_chemo') %>% filter(q == 0.0) %>% pull(d)
print(paste("N =", length(species_richness_long_chemo)))
## [1] "N = 15"
print(paste("Mean species richness:", round(mean(species_richness_long_chemo),0)))
## [1] "Mean species richness: 16079"
print(paste("Lower CI species richness:", round(lower(species_richness_long_chemo),0)))
## [1] "Lower CI species richness: 10394"
print(paste("Upper CI species richness:", round(upper(species_richness_long_chemo),0)))
## [1] "Upper CI species richness: 21764"

Species richness No-NACT group

species_richness_no_chemo <- df %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% filter(group == 'no_chemo') %>% filter(q == 0.0) %>% pull(d)
print(paste("N =", length(species_richness_no_chemo)))
## [1] "N = 32"
print(paste("Mean species richness:", round(mean(species_richness_no_chemo),0)))
## [1] "Mean species richness: 14069"
print(paste("Lower CI species richness:", round(lower(species_richness_no_chemo),0)))
## [1] "Lower CI species richness: 10833"
print(paste("Upper CI species richness:", round(upper(species_richness_no_chemo),0)))
## [1] "Upper CI species richness: 17304"
p2 <- df_hill %>%
  ggplot(aes(x=q, y=e_mean, group=group, color=group)) +
  geom_line() +
  geom_ribbon(aes(ymin = e_lower, ymax = e_upper, fill=group), alpha = 0.1, color=NA) +
  theme_classic() +
  xlab(expression(alpha)) + ylab('Hill evenness') + 
  theme(axis.title.x = element_text(face='italic')) +
  scale_x_continuous(breaks=c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))

p2 <- set_palette(p2, 'jco')
p2

leg <- get_legend(p1)
p3 <- ggarrange(p1, p2, ncol=2, nrow=1,
          legend.grob = leg, 
          legend='right')
ggsave(filename = '/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/hill_div_even.pdf',
       plot = p3, width = 9, height=3)

Comparison of Evenness profiles and AUC

samples <- unique(df$COMET_ID)

hill_auc <- matrix(ncol=1, nrow=0)
#rownames(even_auc) <- samples

for (i in samples){
  x <- df %>% filter(COMET_ID == i) %>% pull(q)
  y <- df %>% filter(COMET_ID == i) %>% pull(e)
  auc <- sintegral(x, y)$int
  hill_auc[i] <- auc
}

hill_auc <- data.frame(hill_auc)
hill_auc['COMET_ID'] <- rownames(hill_auc)
hill_auc <- as_tibble(hill_auc)
hill_auc <- hill_auc %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% as_tibble()
hill_auc <- hill_auc %>% filter(group != 'na')
hill_auc <- hill_auc %>% filter(!Sample %in% duplicate_samples)
hill_auc$group <- factor(hill_auc$group, levels=c('no_chemo', 'short_chemo', 'long_chemo'))
hill_auc <- left_join(x=hill_auc, y=qc_report, by=c("Sample" = "Sample Name")) 

Species richness dependence on coverage

# lm species richness versus coverage
lm.fit <- df %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% filter(q == 0.0) %>%
  lm(.$d ~ .$Coverage, data=.)
summary(lm.fit)
## 
## Call:
## lm(formula = .$d ~ .$Coverage, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11804  -5343  -1420   4514  27243 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27636.35    1745.94  15.829  < 2e-16 ***
## .$Coverage   -634.62      81.64  -7.773 3.22e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7884 on 75 degrees of freedom
## Multiple R-squared:  0.4462, Adjusted R-squared:  0.4388 
## F-statistic: 60.42 on 1 and 75 DF,  p-value: 3.22e-11
confint(lm.fit, level=0.95)
##                  2.5 %     97.5 %
## (Intercept) 24158.2641 31114.4323
## .$Coverage   -797.2652  -471.9846
# plot species richness versus coverage
df %>% merge(metadata, by.x = 'COMET_ID', by.y = 'COMET_ID') %>% filter(q == 0.0) %>%
  ggplot(aes(x=Coverage, y=d)) + 
  geom_point() +
  geom_smooth(method='lm', color='red') + 
  theme_minimal() +
  ylab("Species Richness") + xlab("Coverage")
## `geom_smooth()` using formula 'y ~ x'

Species richness is significantly associated with coverage

AUC evenness profile dependence on coverage

# lm hill evenness auc versus coverage
lm.fit <- lm(hill_auc ~ Coverage, data=hill_auc)
summary(lm.fit)
## 
## Call:
## lm(formula = hill_auc ~ Coverage, data = hill_auc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3526 -0.7069 -0.1350  0.5973  2.5010 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.25846    0.24952  17.067   <2e-16 ***
## Coverage     0.01477    0.01167   1.266    0.209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.127 on 75 degrees of freedom
## Multiple R-squared:  0.02092,    Adjusted R-squared:  0.007868 
## F-statistic: 1.603 on 1 and 75 DF,  p-value: 0.2094
confint(lm.fit, level=0.95)
##                    2.5 %     97.5 %
## (Intercept)  3.761397043 4.75551660
## Coverage    -0.008472166 0.03801431
# plot hill evenness auc versus coverage
p <- hill_auc %>%
  ggplot(aes(x=Coverage, y=hill_auc)) +
  geom_point() + geom_vline(xintercept = 5, linetype = 'dotted', color='red', size=1.5) +
  geom_smooth(method='lm', color='red')

hill_auc <- hill_auc %>% filter(Coverage >= 5)

hill_auc <- hill_auc %>% mutate("NACT-group" = factor(case_when(group == 'no_chemo' ~ 'No-NACT',
                                                 group == 'short_chemo' ~ 'Short-interval',
                                                 group == 'long_chemo' ~ 'Long-interval'),
                                                 levels=c('No-NACT', 'Short-interval', 'Long-interval')))

hill_auc['AUC_group'] <- cut(hill_auc$hill_auc, 3, labels=FALSE)
hill_auc <- hill_auc %>% mutate(Clonality = 10 - hill_auc)

p
## `geom_smooth()` using formula 'y ~ x'

There was no significant association between area under the curve of hill evenness profiles and coverage

library(ggpubr)

my_comparisons <- list(c('No-NACT', 'Short-interval'), 
                       c('No-NACT', 'Long-interval'),
                       c('Short-interval', 'Long-interval'))

p <- ggboxplot(hill_auc, x='NACT-group', y='hill_auc',
               color='NACT-group', palette='jco',
               add='jitter')

p + stat_compare_means(comparisons = my_comparisons, method='t.test') +
  ylab("AUC evenness profile") + xlab("") + theme_minimal() + 
  theme(axis.text.x = element_text(angle=60, hjust=1), 
        legend.position='none')

library(ggpubr)


my_comparisons <- list(c('No-NACT', 'Short-interval'), 
                       c('No-NACT', 'Long-interval'),
                       c('Short-interval', 'Long-interval'))

p <- ggboxplot(hill_auc, x='NACT-group', y='Clonality',
               color='NACT-group', palette='jco',
               add='jitter')

p <- p + stat_compare_means(comparisons = my_comparisons, method='t.test',
                            aes(label = paste0("p = ", ..p.format..))) +
  ylab("Clonality") + xlab("") + theme_minimal() + 
  theme(axis.text.x = element_text(angle=60, hjust=1), 
        legend.position='none')
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/Clonality_AUC.pdf',
       plot=p,
       width=3, height=6)

p

compare_means(Clonality ~ group, data=hill_auc,
              method='t.test',
              p.adjust.method = 'fdr')
## # A tibble: 3 × 8
##   .y.       group1      group2            p p.adj p.format p.signif method
##   <chr>     <chr>       <chr>         <dbl> <dbl> <chr>    <chr>    <chr> 
## 1 Clonality no_chemo    short_chemo 0.00411 0.012 0.0041   **       T-test
## 2 Clonality no_chemo    long_chemo  0.0280  0.042 0.0280   *        T-test
## 3 Clonality short_chemo long_chemo  0.709   0.71  0.7092   ns       T-test
hill_auc
## # A tibble: 77 × 18
##    COMET_ID   hill_auc Sample    ID COMET_ID_nr group  Kit_ID `Kit UID` `Run ID`
##    <chr>         <dbl> <chr>  <dbl>       <dbl> <fct>   <dbl>     <dbl> <chr>   
##  1 COMET-000…     5.72 LivMe…    35           3 no_ch… 376276    376276 376276_1
##  2 COMET-000…     6.38 LivMe…    36           4 no_ch… 376276    376276 376276_1
##  3 COMET-000…     3.70 LivMe…    37           5 short… 376276    376276 376276_1
##  4 COMET-001…     3.55 LivMe…     6          10 no_ch… 376276    376276 376276_1
##  5 COMET-001…     5.57 LivMe…     5          11 no_ch… 376276    376276 376276_1
##  6 COMET-001…     4.15 LivMe…     7          15 no_ch… 376276    376276 376276_1
##  7 COMET-001…     4.17 LivMe…     8          17 no_ch… 376276    376276 376276_1
##  8 COMET-001…     3.93 LivMe…     9          18 short… 376276    376276 376276_1
##  9 COMET-002…     5.27 LivMe…    39          23 no_ch… 376276    376276 376276_1
## 10 COMET-002…     3.98 LivMe…    10          24 no_ch… 376276    376276 376276_1
## # … with 67 more rows, and 9 more variables: Well Location <chr>,
## #   Date Posted <chr>, Gene Rearrangements <dbl>, Coverage <dbl>,
## #   % On-target <dbl>, QC Status <chr>, NACT-group <fct>, AUC_group <int>,
## #   Clonality <dbl>

Comparison of pCRC sidedness

library(ggpubr)

sidedness <- readxl::read_xlsx('/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/tables/metadata.xlsx') %>% select(Sample, pCRC_Location)
hill_auc_with_sidedness <- hill_auc %>% left_join(sidedness) %>% 
  filter(!pCRC_Location %in% c("na")) %>%
  mutate(pCRC_Location = case_when(
    pCRC_Location == "right_colon" ~ "Right Side",
    pCRC_Location == "left_colon" ~ "Left Side",
    pCRC_Location == "rectum" ~ "Rectum",
    pCRC_Location == "transverse_colon" ~ "Left Side"
  )) %>%
  mutate(pCRC_Location = factor(pCRC_Location, levels = c("Right Side", "Left Side", "Rectum")))
## Joining, by = "Sample"
my_comparisons <- list(c('Left Side', 'Right Side'), 
                       c('Left Side', 'Rectum'),
                       c('Right Side', 'Rectum'))

p <- ggboxplot(hill_auc_with_sidedness, x='pCRC_Location', y='Clonality',
               color='pCRC_Location', #palette='jco',
               add='jitter')

p <- p + stat_compare_means(comparisons = my_comparisons, method='t.test',
                            aes(label = paste0("p = ", ..p.format..))) +
  ylab("Clonality") + xlab("") + theme_minimal() + 
  theme(axis.text.x = element_text(angle=60, hjust=1), 
        legend.position='none')
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/Clonality_AUC_pcrc_sidedness.pdf',
       plot=p,
       width=3, height=6)

p

sidedness$pCRC_Location
##  [1] "right_colon"      "rectum"           "left_colon"       "right_colon"     
##  [5] "left_colon"       "left_colon"       "rectum"           "rectum"          
##  [9] "rectum"           "rectum"           "right_colon"      "rectum"          
## [13] "rectum"           "left_colon"       "left_colon"       "left_colon"      
## [17] "left_colon"       "left_colon"       "left_colon"       "rectum"          
## [21] "right_colon"      "right_colon"      "rectum"           "right_colon"     
## [25] "left_colon"       "transverse_colon" "left_colon"       "left_colon"      
## [29] "left_colon"       "right_colon"      "rectum"           "rectum"          
## [33] "right_colon"      "rectum"           "left_colon"       "left_colon"      
## [37] "rectum"           "rectum"           "rectum"           "right_colon"     
## [41] "left_colon"       "right_colon"      "right_colon"      "left_colon"      
## [45] "right_colon"      "left_colon"       "rectum"           "rectum"          
## [49] "rectum"           "left_colon"       "rectum"           "right_colon"     
## [53] "rectum"           "rectum"           "rectum"           "rectum"          
## [57] "right_colon"      "left_colon"       "left_colon"       "rectum"          
## [61] "rectum"           "rectum"           "rectum"           "rectum"          
## [65] "rectum"           "rectum"           "rectum"           "rectum"          
## [69] "left_colon"       "left_colon"       "left_colon"       "rectum"          
## [73] "rectum"           "left_colon"       "left_colon"       "na"              
## [77] "right_colon"      "na"               "left_colon"       "right_colon"     
## [81] "left_colon"       "rectum"           "left_colon"       "left_colon"      
## [85] "rectum"           "right_colon"      "left_colon"       "rectum"          
## [89] "left_colon"       "left_colon"       "left_colon"       "na"

Species richness all NACT groups

clonality_all <- hill_auc %>% pull(Clonality)
print(paste("N =", length(clonality_all)))
## [1] "N = 77"
print(paste("Mean Clonality:", round(mean(clonality_all),1)))
## [1] "Mean Clonality: 5.5"
print(paste("Lower CI Clonality:", round(lower(clonality_all),1)))
## [1] "Lower CI Clonality: 5.2"
print(paste("Upper CI Clonality:", round(upper(clonality_all),1)))
## [1] "Upper CI Clonality: 5.7"

Species richness short-interval NACT group

clonality_short_chemo <- hill_auc %>% filter(group == 'short_chemo') %>% pull(Clonality)
print(paste("N =", length(clonality_short_chemo)))
## [1] "N = 30"
print(paste("Mean Clonality:", round(mean(clonality_short_chemo),1)))
## [1] "Mean Clonality: 5.8"
print(paste("Lower CI Clonality:", round(lower(clonality_short_chemo),1)))
## [1] "Lower CI Clonality: 5.5"
print(paste("Upper CI Clonality:", round(upper(clonality_short_chemo),1)))
## [1] "Upper CI Clonality: 6.2"

Species richness long-interval NACT group

clonality_long_chemo <- hill_auc %>% filter(group == 'long_chemo') %>% pull(Clonality)
print(paste("N =", length(clonality_long_chemo)))
## [1] "N = 15"
print(paste("Mean Clonality:", round(mean(clonality_long_chemo),1)))
## [1] "Mean Clonality: 5.7"
print(paste("Lower CI Clonality:", round(lower(clonality_long_chemo),1)))
## [1] "Lower CI Clonality: 5.2"
print(paste("Upper CI Clonality:", round(upper(clonality_long_chemo),1)))
## [1] "Upper CI Clonality: 6.2"

Species richness No-NACT group

clonality_no_chemo <- hill_auc %>% filter(group == 'no_chemo') %>% pull(Clonality)
print(paste("N =", length(clonality_no_chemo)))
## [1] "N = 32"
print(paste("Mean Clonality:", round(mean(clonality_no_chemo),1)))
## [1] "Mean Clonality: 5"
print(paste("Lower CI Clonality:", round(lower(clonality_no_chemo),1)))
## [1] "Lower CI Clonality: 4.6"
print(paste("Upper CI Clonality:", round(upper(clonality_no_chemo),1)))
## [1] "Upper CI Clonality: 5.4"
lm.fit <- left_join(hill_auc, Rearrangement_all %>% group_by(COMET_ID) %>% 
                        mutate(nr_tcells = sum(seq_count)) %>% 
                        dplyr::select(COMET_ID, nr_tcells) %>% distinct()) %>%
  lm(Clonality ~ nr_tcells, data=.)
## Joining, by = "COMET_ID"
summary(lm.fit)
## 
## Call:
## lm(formula = Clonality ~ nr_tcells, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59439 -0.67233 -0.05764  0.79094  1.94934 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.604e+00  2.574e-01  17.891  < 2e-16 ***
## nr_tcells   8.433e-06  2.189e-06   3.852 0.000494 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.029 on 34 degrees of freedom
##   (41 observations deleted due to missingness)
## Multiple R-squared:  0.3038, Adjusted R-squared:  0.2834 
## F-statistic: 14.84 on 1 and 34 DF,  p-value: 0.0004938
confint(lm.fit, level=0.95)
##                    2.5 %       97.5 %
## (Intercept) 4.081417e+00 5.127430e+00
## nr_tcells   3.983955e-06 1.288189e-05
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
p <- left_join(hill_auc, Rearrangement_all %>% group_by(COMET_ID) %>% 
                        mutate(nr_tcells = sum(seq_count)) %>% 
                        dplyr::select(COMET_ID, nr_tcells) %>% distinct()) %>%
  ggplot(aes(x=nr_tcells, y=Clonality)) +
  geom_point() +
  geom_smooth(method='lm', color='red') +
  theme_minimal() +
  ylab("Clonality") + xlab("Number of T cells") +
  scale_x_continuous(labels = comma)
## Joining, by = "COMET_ID"
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/lm_clonality_nr_t_cells.pdf',
       plot=p,
       width=5, height=5)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 41 rows containing non-finite values (stat_smooth).
## Warning: Removed 41 rows containing missing values (geom_point).
p
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 41 rows containing non-finite values (stat_smooth).

## Warning: Removed 41 rows containing missing values (geom_point).

meta1 <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data_376276/metadata.txt') %>% dplyr::select(-group_code)
## Rows: 46 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample, COMET_ID, group, group_code
## dbl (3): ID, COMET_ID_nr, Kit_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
meta2 <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data_12511620/metadata.txt') %>% dplyr::select(-group_code)
## Rows: 47 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Sample, COMET_ID, group
## dbl (4): ID, COMET_ID_nr, group_code, Kit_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metadata <- bind_rows(meta1, meta2)
no_chemo_names <- metadata %>% filter(group == 'no_chemo') %>% pull(Sample)
short_chemo_names <- metadata %>% filter(group == 'short_chemo') %>% pull(Sample)
long_chemo_names <- metadata %>% filter(group == 'long_chemo') %>% pull(Sample)
comb_rear_all <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/CombinedRearrangements_ALL_COMET.tsv')
## Rows: 1413435 Columns: 98
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): Amino Acid
## dbl (97): Sum (Templates), Present In, LivMet_1, LivMet_10, LivMet_11, LivMe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
columns <- colnames(comb_rear_all[, 4:ncol(comb_rear_all)])
columns <- columns[1:(length(columns)-2)]
df_ <- data.frame(
  'Sample' = columns
)
columns <- df_ %>% left_join(metadata) %>% pull(COMET_ID)
## Joining, by = "Sample"
colnames(comb_rear_all) <- c("Amino Acid", "Sum (Templates)", "Present In", columns, "Negative", "Positive")
comb_rear_all
## # A tibble: 1,413,435 × 98
##    `Amino Acid`      `Sum (Templates)` `Present In` `COMET-0026M1` `COMET-0024M1`
##    <chr>                         <dbl>        <dbl>          <dbl>          <dbl>
##  1 CASSLRTGATLHAFF               34204            2              0              0
##  2 CASSQEPDIKPNQPQHF             28532            1              0              0
##  3 CASSIMAAGGDGYTF               14526            1              0              0
##  4 CASSEFRVGYEQYF                12134            1              0              0
##  5 CASSNRGGNEQFF                 11557            3              0              0
##  6 CASSQNRDGEQFF                 11278            2              0              0
##  7 CASSPPDRNTEAFF                10984            2              0              0
##  8 CATKDSKNTEAFF                  9903            2              0              0
##  9 CASRGSMNTEAFF                  8848            8              0              0
## 10 CASSETGQGHGYTF                 8508            1              0              0
## # … with 1,413,425 more rows, and 93 more variables: COMET-0026M2 <dbl>,
## #   COMET-0027M3 <dbl>, COMET-0030M1 <dbl>, COMET-0030M2 <dbl>,
## #   COMET-0031M1 <dbl>, COMET-0032M1 <dbl>, COMET-0032M2 <dbl>,
## #   COMET-0034M1 <dbl>, COMET-0038M1 <dbl>, COMET-0035M1_1 <dbl>,
## #   COMET-0040M1 <dbl>, COMET-0041M1 <dbl>, COMET-0042M1 <dbl>,
## #   COMET-0043M1 <dbl>, COMET-0046M1 <dbl>, COMET-0049M1 <dbl>,
## #   COMET-0051M1 <dbl>, COMET-0052M1 <dbl>, COMET-0053M1 <dbl>, …
public_private <- comb_rear_all %>% 
  mutate(public_private = case_when(`Present In` == 1 ~ "Private",
                                    `Present In` > 1 ~ "Public")) %>%
  dplyr::select(`Amino Acid`, `Sum (Templates)`, `Present In`, public_private)
public_private
## # A tibble: 1,413,435 × 4
##    `Amino Acid`      `Sum (Templates)` `Present In` public_private
##    <chr>                         <dbl>        <dbl> <chr>         
##  1 CASSLRTGATLHAFF               34204            2 Public        
##  2 CASSQEPDIKPNQPQHF             28532            1 Private       
##  3 CASSIMAAGGDGYTF               14526            1 Private       
##  4 CASSEFRVGYEQYF                12134            1 Private       
##  5 CASSNRGGNEQFF                 11557            3 Public        
##  6 CASSQNRDGEQFF                 11278            2 Public        
##  7 CASSPPDRNTEAFF                10984            2 Public        
##  8 CATKDSKNTEAFF                  9903            2 Public        
##  9 CASRGSMNTEAFF                  8848            8 Public        
## 10 CASSETGQGHGYTF                 8508            1 Private       
## # … with 1,413,425 more rows

Fraction public versus private clones

nr_private <- public_private %>% filter(public_private == 'Private') %>% pull(public_private) %>% length()
nr_public <- public_private %>% filter(public_private == 'Public') %>% pull(public_private) %>% length()
total_clones <- nr_private + nr_public
fraction_private <- nr_private / total_clones
fraction_public <- nr_public / total_clones


print(paste("Public Clones:", nr_public))
## [1] "Public Clones: 137245"
print(paste("Fraction Public:", fraction_public))
## [1] "Fraction Public: 0.0971003265095317"
print(paste("Private Clones:", nr_private))
## [1] "Private Clones: 1276190"
print(paste("Fraction Private:", fraction_private))
## [1] "Fraction Private: 0.902899673490468"
df_public_private <- data.frame(
  "type" = c("Private", "Public"),
  "Percentage" = c(fraction_private*100, fraction_public*100)
  #"Private" = fraction_private,
  #"Public" = fraction_public
)
df_public_private$type <- factor(df_public_private$type, levels=c("Public", "Private"))
p <- df_public_private %>% 
  ggplot(aes(x=type, y=Percentage, fill=type)) + 
  scale_fill_jama() +#values=c("#E7B800", "#FC4E07")) +
  geom_col(color='black') + 
  theme_classic() + xlab("") +
  scale_y_continuous(name="% of Total Clonltypes", limits=c(0, 100), breaks=seq(0, 100, 25)) +
  theme(legend.position="none")
  
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/public_vs_private_percentage.pdf',
       plot=p,
       width=1.5, height=4)
p

x <- as.data.frame(comb_rear_all %>% dplyr::select(!c("Positive", "Negative")))
rownames(x) <- x$`Amino Acid`
x <- x[, 4:(ncol(x))]
result <- mh(x, PlugIn = TRUE, graph=TRUE)

### Comparison of similarity of samples from the same patient.

same_patient <- c("LivMet_16", "LivMet_17", "LivMet_32", "LivMet_33", "LivMet_3",  "LivMet_42", 
                  "LivMet_2",  "LivMet_40", "LivMet_38", "LivMet_5",  "LivMet_13", "LivMet_14")
same_patient <- c("COMET-0026M1", "COMET-0026M2", 
                  "COMET-0032M1", "COMET-0032M2", 
                  "COMET-0062M1",  "COMET-0062M2", 
                  "COMET-0059M1_1",  "COMET-0059M1_2", 
                  "COMET-0035M1_1", "COMET-0035M1_2",  
                  "COMET-0011M1_1", "COMET-0011M1_2",
                  "COMET-0030M1","COMET-0030M2")
mh_same_patient <- result$PlugIn[same_patient, same_patient]
pheatmap(mh_same_patient, cluster_rows = FALSE, cluster_cols = FALSE)

same_patient_cols <- c("COMET-0026M1", "COMET-0032M1", "COMET-0062M1", "COMET-0059M1_1", "COMET-0035M1_1", "COMET-0011M1_1", "COMET-0030M1")
same_patient_rows <- c("COMET-0026M2", "COMET-0032M2", "COMET-0062M2", "COMET-0059M1_2", "COMET-0035M1_2", "COMET-0011M1_2", "COMET-0030M2")

same_patient_mh <- c(
  mh_same_patient["COMET-0026M2",  "COMET-0026M1"],
  mh_same_patient["COMET-0032M2",  "COMET-0032M1"],
  mh_same_patient["COMET-0062M2",  "COMET-0062M1"],
  mh_same_patient["COMET-0030M2",  "COMET-0030M1"],
  mh_same_patient["COMET-0059M1_2","COMET-0059M1_1"],
  mh_same_patient["COMET-0035M1_2","COMET-0035M1_1"],
  mh_same_patient["COMET-0011M1_1","COMET-0011M1_2"])

mh_same_vs_diff_df <- data.frame(
  'mh_val' = same_patient_mh
)
mh_same_vs_diff_df['type'] <- c('Same Patient Different Metastasis',
                                'Same Patient Different Metastasis',
                                'Same Patient Different Metastasis',
                                'Same Patient Different Metastasis',
                                'Same Patient Same Metastasis',
                                'Same Patient Same Metastasis',
                                'Same Patient Same Metastasis'
                                )



non_same <- metadata %>% filter(!COMET_ID %in% same_patient) %>% pull(COMET_ID)
mh_non_same <- result$PlugIn[non_same, non_same]
df_ <- data.frame(
  'mh_val' = mh_non_same[lower.tri(mh_non_same)] )
df_['type'] <- 'Different Patient'
df_ <- as_tibble(df_)
mh_same_vs_diff_df <- as_tibble(mh_same_vs_diff_df)
mh_same_vs_diff_df <- bind_rows(mh_same_vs_diff_df, df_)

mh_same_vs_diff_df$type <- factor(mh_same_vs_diff_df$type, levels=c("Different Patient",
                                                                             "Same Patient Different Metastasis",
                                                                             "Same Patient Same Metastasis"))

my_comparisons <- list( c('Same Patient Same Metastasis', 'Different Patient'),
                        c('Same Patient Different Metastasis', 'Same Patient Same Metastasis'), 
                        c('Same Patient Different Metastasis', 'Different Patient')
                      )

p <- ggboxplot(mh_same_vs_diff_df, x='type', y='mh_val',
               fill='type', palette=c("#00AFBB", "#E7B800", "#FC4E07"),#'jam',
               add='jitter') + theme(legend.position='none')# +
 # ylim(c(0, 1.2)))

p <- p + stat_compare_means(comparisons = my_comparisons, method='t.test', label.y=c(1,1.15, .85)) +
  scale_y_continuous(name="MH-Index", limits=c(0, 1.2), breaks=seq(0, 1, .25)) + xlab("") + 
  theme(axis.text.x = element_text(size=9,
        vjust = grid::unit(c(-2, 0, -2), "points")))
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/morisita_horn.pdf',
       plot=p,
       width=5, height=4)
## Warning: Removed 120 rows containing missing values (geom_point).
p
## Warning: Removed 116 rows containing missing values (geom_point).

print(paste("Mean Same Patient Same Metastasis: ", round(mh_same_vs_diff_df %>% filter(type=='Same Patient Same Metastasis') %>% pull(mh_val) %>% mean(),2)))
## [1] "Mean Same Patient Same Metastasis:  0.93"
print(paste("Min Same Patient Same Metastasis: ", round(mh_same_vs_diff_df %>% filter(type=='Same Patient Same Metastasis') %>% pull(mh_val) %>% min(),2)))
## [1] "Min Same Patient Same Metastasis:  0.86"
print(paste("Max Same Patient Same Metastasis: ", round(mh_same_vs_diff_df %>% filter(type=='Same Patient Same Metastasis') %>% pull(mh_val) %>% max(),2)))
## [1] "Max Same Patient Same Metastasis:  0.97"
print(paste("Mean Same Patient Different Metastasis: ", round(mh_same_vs_diff_df %>% filter(type=='Same Patient Different Metastasis') %>% pull(mh_val) %>% mean(),2)))
## [1] "Mean Same Patient Different Metastasis:  0.45"
print(paste("Mean Min Patient Different Metastasis: ", round(mh_same_vs_diff_df %>% filter(type=='Same Patient Different Metastasis') %>% pull(mh_val) %>% min(),2)))
## [1] "Mean Min Patient Different Metastasis:  0.09"
print(paste("Mean Max Patient Different Metastasis: ", round(mh_same_vs_diff_df %>% filter(type=='Same Patient Different Metastasis') %>% pull(mh_val) %>% max(),2)))
## [1] "Mean Max Patient Different Metastasis:  0.97"
print(paste("Mean Different Patient: ", round(mh_same_vs_diff_df %>% filter(type=='Different Patient') %>% pull(mh_val) %>% mean(),10)))
## [1] "Mean Different Patient:  0.000700742"
print(paste("Min Different Patient: ", round(mh_same_vs_diff_df %>% filter(type=='Different Patient') %>% pull(mh_val) %>% min(),5)))
## [1] "Min Different Patient:  0"
print(paste("Max Different Patient: ", round(mh_same_vs_diff_df %>% filter(type=='Different Patient') %>% pull(mh_val) %>% max(),5)))
## [1] "Max Different Patient:  0.06873"
graph_param_random <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/graph_params_random_public.tsv')
df <- as.data.frame(graph_param_random %>% dplyr::select(-one_of('COMET_ID', 'COMET_ID_nr', 'group', 'Kit_ID',
                                                                 'ld', 'group_code', 'ID')) %>% drop_na())
rownames(df) <- df$sample
df$sample <- NULL
df <- mutate_all(df, function(x) as.numeric(as.character(x)))
df['Sample'] <- rownames(df)
df <- left_join(df, metadata %>% dplyr::select(Sample, group))
df <- df %>% filter(group != 'na')

graph_param_random <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/graph_params_random_private.tsv')
df2 <- as.data.frame(graph_param_random %>% dplyr::select(-one_of('COMET_ID', 'COMET_ID_nr', 'group', 'Kit_ID',
                                                                 'ld', 'group_code', 'ID')))# %>% drop_na())
rownames(df2) <- df2$sample
df2$sample <- NULL
df2 <- mutate_all(df2, function(x) as.numeric(as.character(x)))
df2['Sample'] <- rownames(df2)
df2 <- left_join(df2, metadata %>% dplyr::select(Sample, group))
df2 <- df2 %>% filter(group != 'na')


df <- df[, 1:19] %>% pivot_longer(!Sample, names_to='parameter', values_to = "value")
df['private_public'] <- 'Public'
df2 <- df2[, 1:19] %>% pivot_longer(!Sample, names_to='parameter', values_to = "value")
df2['private_public'] <- 'Private'
df <- bind_rows(df, df2)
df <- df %>% left_join(coverage, by=c("Sample"="sample_id"))
df <- df %>% filter(Coverage >= 5)
df
## # A tibble: 2,772 × 5
##    Sample   parameter         value private_public Coverage
##    <chr>    <chr>             <dbl> <chr>             <dbl>
##  1 LivMet_1 transitivity    0.233   Public               26
##  2 LivMet_1 authority       0.00795 Public               26
##  3 LivMet_1 page_rank       0.00100 Public               26
##  4 LivMet_1 eigen_centr     0.00795 Public               26
##  5 LivMet_1 n_nodes      1000       Public               26
##  6 LivMet_1 n_edges       130       Public               26
##  7 LivMet_1 max_degree      5       Public               26
##  8 LivMet_1 largest_comp   19       Public               26
##  9 LivMet_1 max_k_core      2       Public               26
## 10 LivMet_1 diameter        8       Public               26
## # … with 2,762 more rows
comparison <- df %>% filter(parameter == 'n_edges')


my_comparisons <- list(c('Private', 'Public'))

p <- ggboxplot(comparison, x='private_public', y='value',
               color='private_public', palette='jama',
               add='jitter') + theme(legend.position='none')

p <- p + stat_compare_means(comparisons = my_comparisons, method='t.test') +
  ylab("Number of Connections (Edges)") + xlab("")
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/connectivity_public_vs_private.pdf',
       plot=p,
       width=4, height=5)
p

print(paste("Nr Public:", df %>% filter(parameter=='n_edges') %>% filter(private_public == "Public") %>% pull(Sample) %>% length()))
## [1] "Nr Public: 77"
print(paste("Nr Private:", df %>% filter(parameter=='n_edges') %>% filter(private_public == "Public") %>% pull(Sample) %>% length()))
## [1] "Nr Private: 77"
top_10perc_mcpas <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/top_public_connected_rear/top_10perc_mcpas_hits.tsv')
## New names:
## * `` -> ...1
## Rows: 92 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): ...1
## dbl (2): nr_private_hits, nr_public_hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_10perc_mcpas <- top_10perc_mcpas %>% rename("sample_name" = "...1")
top_10perc_mcpas <- top_10perc_mcpas %>%
  pivot_longer(-sample_name, names_to='group', values_to='Fraction McPAS hits') %>%
  mutate(group = case_when(
    group == 'nr_private_hits' ~ 'Top 10% most\nexpanded clones',
    group == 'nr_public_hits' ~ 'Top 10% clones by\nconnectivity and\nsharing level'
  )) %>%
  mutate(group = factor(group, levels=c('Top 10% clones by\nconnectivity and\nsharing level',
                                        'Top 10% most\nexpanded clones'))) %>%
  mutate(`Fraction McPAS hits`=`Fraction McPAS hits`*100)

comparison <- top_10perc_mcpas

my_comparisons <- list(c('Top 10% most\nexpanded clones', 'Top 10% clones by\nconnectivity and\nsharing level'))

p <- ggboxplot(comparison, x='group', y='Fraction McPAS hits',
               color='group', palette="nejm",
               add='jitter') + theme(legend.position='none')

p <- p + stat_compare_means(comparisons = my_comparisons, method='t.test', paired=TRUE) +
  ylab("Fraction CDR3 amino acid hits to McPAS database") + xlab("")
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/mcpas_hits_most_clonal_vs_most_connected_and_public.pdf',
       plot=p,
       width=4, height=5)
p

public_fraction <- comparison %>% filter(group == "Top 10% clones by\nconnectivity and\nsharing level") 
print(paste("Mean Top 10% clones by connectivity and sharing level connectivity:", public_fraction %>% pull(`Fraction McPAS hits`) %>% mean())) #%>% round(0) / 10))
## [1] "Mean Top 10% clones by connectivity and sharing level connectivity: 10.0293821311376"
print(paste("Lower CI Top 10% clones by connectivity and sharing level connectivity:", public_fraction %>% pull(`Fraction McPAS hits`) %>% lower())) #%>% round(0) / 10))
## [1] "Lower CI Top 10% clones by connectivity and sharing level connectivity: 9.68561598462979"
print(paste("Upper CI Top 10% clones by connectivity and sharing level connectivity:", public_fraction %>% pull(`Fraction McPAS hits`) %>% upper())) #%>% round(0) / 10))
## [1] "Upper CI Top 10% clones by connectivity and sharing level connectivity: 10.3731482776455"
private_fraction <- comparison %>% filter(group == "Top 10% most\nexpanded clones") 
print(paste("Mean Top 10% most expanded clones connectivity:", private_fraction %>% pull(`Fraction McPAS hits`) %>% mean())) #%>% round(0) / 10))
## [1] "Mean Top 10% most expanded clones connectivity: 2.07607803089866"
print(paste("Lower CI Top 10% most expanded clones connectivity:", private_fraction %>% pull(`Fraction McPAS hits`) %>% lower())) #%>% round(0) / 10))
## [1] "Lower CI Top 10% most expanded clones connectivity: 1.91647388830312"
print(paste("Upper CI Top 10% most expanded clones connectivity:", private_fraction %>% pull(`Fraction McPAS hits`) %>% upper())) #%>% round(0) / 10))
## [1] "Upper CI Top 10% most expanded clones connectivity: 2.2356821734942"
meta <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/immunarch_data/metadata.txt')
meta
## # A tibble: 93 × 7
##    Sample       ID COMET_ID       COMET_ID_nr group       group_code Kit_ID
##    <chr>     <dbl> <chr>                <dbl> <chr>       <chr>       <dbl>
##  1 LivMet_1      1 COMET-0026M1            26 no_chemo    0          376276
##  2 LivMet_2      2 COMET-0035M1_1          35 no_chemo    0          376276
##  3 LivMet_3      3 COMET-0059M1_1          59 no_chemo    0          376276
##  4 LivMet_4      4 COMET-0078M1            78 no_chemo    0          376276
##  5 LivMet_5      5 COMET-0011M1_1          11 no_chemo    0          376276
##  6 LivMet_6      6 COMET-0010M1            10 no_chemo    0          376276
##  7 LivMet_7      7 COMET-0015M1            15 no_chemo    0          376276
##  8 LivMet_8      8 COMET-0017M1            17 no_chemo    0          376276
##  9 LivMet_9      9 COMET-0018M2            18 short_chemo 1          376276
## 10 LivMet_10    10 COMET-0024M1            24 no_chemo    0          376276
## # … with 83 more rows
df2 <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/imnet_output/powerlaw.tsv')

same_patient <- c(
  'LivMet_38', 'LivMet_11', 'LivMet_14', 'LivMet_17',
  'LivMet_40', 'LivMet_42', 'LivMet_33'
) 


df2 <- df2 %>% filter(!Sample %in% same_patient) %>% left_join(coverage, by=c("Sample" = "sample_id")) %>% filter(Coverage >= 5) %>%
  filter(Sample != "LivMet_21")

p <- df2 %>%
  filter(group != 'na') %>%
  mutate(group = fct_relevel(group,
                            'no_chemo', 'short_chemo', 'long_chemo')) %>%
  mutate(group = case_when(group == "no_chemo" ~ "No-NACT",
                           group == "short_chemo" ~ "Short-interval",
                           group == "long_chemo" ~ "Long-interval")) %>% 
  mutate(group = fct_relevel(group, "No-NACT", "Short-interval", "Long-interval")) %>%
  ggplot(aes(x=group, y=p_val, fill=group), color='black') +
  geom_quasirandom(pch=21, size=3) +
  geom_hline(yintercept=.1, linetype='dashed', color='black') +
  scale_y_continuous(breaks=seq(0,1,.1), limits = c(0,1)) +
  theme_minimal() +
  theme(legend.position="none") +
 # scale_fill_manual(values = wes_palette(n=3, name="GrandBudapest1")) +
  ylab("P-value") + xlab("")
p <- set_palette(p, "jco")
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/powerlaw.pdf',
       plot=p,
       width=5, height=5)
p

Datasets that passed the power law goodness of fit test

df2 %>% left_join(hill_auc) %>% filter(p_val > 0.1)
## Joining, by = c("Sample", "ID", "COMET_ID", "COMET_ID_nr", "group", "Kit_ID", "Coverage")
## # A tibble: 9 × 20
##   Sample    p_val    ID COMET_ID   COMET_ID_nr group  group_code Kit_ID Coverage
##   <chr>     <dbl> <dbl> <chr>            <dbl> <chr>  <chr>       <dbl>    <dbl>
## 1 LivMet_2  0.143     2 COMET-003…          35 no_ch… 0          3.76e5     31.6
## 2 LivMet_23 0.154    23 COMET-004…          43 short… 1          3.76e5     24.8
## 3 LivMet_28 0.126    28 COMET-005…          53 no_ch… 0          3.76e5     20.4
## 4 LivMet_3  0.129     3 COMET-005…          59 no_ch… 0          3.76e5     29.9
## 5 LivMet_4  0.172     4 COMET-007…          78 no_ch… 0          3.76e5      6.7
## 6 LivMet_47 0.2      47 COMET-008…          82 no_ch… 0          1.25e7     11.1
## 7 LivMet_64 0.432    64 COMET-014…         140 short… 1          1.25e7     25.6
## 8 LivMet_69 0.134    69 COMET-018…         186 short… 1          1.25e7     20.2
## 9 LivMet_79 0.133    79 COMET-026…         266 short… 1          1.25e7     40.3
## # … with 11 more variables: hill_auc <dbl>, Kit UID <dbl>, Run ID <chr>,
## #   Well Location <chr>, Date Posted <chr>, Gene Rearrangements <dbl>,
## #   % On-target <dbl>, QC Status <chr>, NACT-group <fct>, AUC_group <int>,
## #   Clonality <dbl>
df2 %>% left_join(hill_auc) %>% filter(p_val > 0.1) %>% pull(Clonality) %>% mean() %>% round(1)
## [1] 4.4
df2 %>% left_join(hill_auc) %>% filter(p_val >= 0.0) %>% drop_na() %>% pull(Clonality) %>% mean() %>% round(1)
## [1] 5.5
global_param_subset <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/global_param_subset.tsv')
global_param_subset['Sample'] <- global_param_subset['...1']
global_param_subset['...1'] <- NULL
global_param_subset
## # A tibble: 77 × 11
##    nr_nodes number_edges largest_component maximal_k_core diameter assortativity
##       <dbl>        <dbl>             <dbl>          <dbl>    <dbl>         <dbl>
##  1     8611          947               109              6       21         0.389
##  2     6852          727                66              4       16         0.361
##  3    34849        13527              4371             10       43         0.458
##  4    17894         4462              1026              9       19         0.507
##  5     9218         1144               168              6       25         0.435
##  6    10109         1307               171              4       20         0.444
##  7    10019         1492               357              5       20         0.395
##  8    18256         4915              1180             11       24         0.517
##  9     2351          109                 8              4        4         0.663
## 10     4818          447                38              4       16         0.434
## # … with 67 more rows, and 5 more variables: average_cluster <dbl>,
## #   average_degree <dbl>, clustering_coeff <dbl>, density <dbl>, Sample <chr>
global_param_subset <- global_param_subset %>% left_join(metadata) %>% left_join(coverage, by=c("Sample"="sample_id"))
global_param_subset <- global_param_subset %>% filter(Coverage > 5)
global_param_subset['connectivity_fraction'] <- global_param_subset$number_edges / global_param_subset$nr_nodes
global_param_subset
## # A tibble: 76 × 18
##    nr_nodes number_edges largest_component maximal_k_core diameter assortativity
##       <dbl>        <dbl>             <dbl>          <dbl>    <dbl>         <dbl>
##  1     8611          947               109              6       21         0.389
##  2     6852          727                66              4       16         0.361
##  3    34849        13527              4371             10       43         0.458
##  4    17894         4462              1026              9       19         0.507
##  5     9218         1144               168              6       25         0.435
##  6    10109         1307               171              4       20         0.444
##  7    10019         1492               357              5       20         0.395
##  8    18256         4915              1180             11       24         0.517
##  9     2351          109                 8              4        4         0.663
## 10     4818          447                38              4       16         0.434
## # … with 66 more rows, and 12 more variables: average_cluster <dbl>,
## #   average_degree <dbl>, clustering_coeff <dbl>, density <dbl>, Sample <chr>,
## #   ID <dbl>, COMET_ID <chr>, COMET_ID_nr <dbl>, group <chr>, Kit_ID <dbl>,
## #   Coverage <dbl>, connectivity_fraction <dbl>

Mean nr nodes

print(paste("Mean number of nodes:", round(mean(global_param_subset$nr_nodes), 0)))
## [1] "Mean number of nodes: 16056"
print(paste("Lower CI of nodes:   ", round(lower(global_param_subset$nr_nodes), 0)))
## [1] "Lower CI of nodes:    13637"
print(paste("Upper CI of nodes:   ", round(upper(global_param_subset$nr_nodes), 0)))
## [1] "Upper CI of nodes:    18475"

Mean nr edges

print(paste("Mean number of edges:", round(mean(global_param_subset$number_edges), 0)))
## [1] "Mean number of edges: 4133"
print(paste("Lower CI of edges:   ", round(lower(global_param_subset$number_edges), 0)))
## [1] "Lower CI of edges:    3015"
print(paste("Upper CI of edges:   ", round(upper(global_param_subset$number_edges), 0)))
## [1] "Upper CI of edges:    5251"

Mean connectivity fraction

print(paste("Mean connectivity:", round(mean(global_param_subset$connectivity_fraction), 4)))
## [1] "Mean connectivity: 0.1897"
print(paste("Lower CI of connectivity:   ", round(lower(global_param_subset$connectivity_fraction), 4)))
## [1] "Lower CI of connectivity:    0.1647"
print(paste("Upper CI of connectivity:   ", round(upper(global_param_subset$connectivity_fraction), 4)))
## [1] "Upper CI of connectivity:    0.2146"
degree_share = read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/degree_share.tsv')

degree_share <- degree_share %>% arrange(share_level)
degree_share$share_level[degree_share$share_level == 0] <- 1
degree_share
## # A tibble: 1,671,005 × 2
##    degree share_level
##     <dbl>       <dbl>
##  1      0           1
##  2      0           1
##  3      0           1
##  4      0           1
##  5      0           1
##  6      0           1
##  7      0           1
##  8      0           1
##  9      0           1
## 10      0           1
## # … with 1,670,995 more rows
p <- degree_share %>%
  ggplot(aes(share_level, degree)) + 
  stat_summary(fun = mean, geom = 'bar', fill='darkgrey') +
  stat_summary(fun.data = "mean_cl_normal", 
               geom = 'errorbar',
               width = .4) +
  theme_classic() +
  xlab("Number of patients") + ylab("Clonal connections (Degrees)") +
  scale_x_continuous(breaks=c(1, 10, 20, 30, 40, 50))
ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/degree_to_sharing.pdf',
       plot=p,
       width=6, height=4)
p

connectivity_to_nodes <- read_tsv("/Users/eirikhoy/Dropbox/projects/airr_comet/data/connectivity_fraction_to_nr_nodes.tsv")[, 2:3]
attach(connectivity_to_nodes)
lm.fit <- lm(fraction ~ nr_nodes, data=connectivity_to_nodes)
summary(lm.fit)
## 
## Call:
## lm(formula = fraction ~ nr_nodes, data = connectivity_to_nodes)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.053419 -0.019535 -0.005885  0.009346  0.179879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.862e-02  6.203e-03   6.227 1.87e-08 ***
## nr_nodes    9.300e-06  2.767e-07  33.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03319 on 83 degrees of freedom
## Multiple R-squared:  0.9315, Adjusted R-squared:  0.9307 
## F-statistic:  1129 on 1 and 83 DF,  p-value: < 2.2e-16
confint(lm.fit, level=0.95)
##                    2.5 %       97.5 %
## (Intercept) 2.628540e-02 5.095917e-02
## nr_nodes    8.749917e-06 9.850738e-06
p <- connectivity_to_nodes %>%
  ggplot(aes(x=nr_nodes, y=fraction)) +
  geom_point() +
  theme_minimal() +
  ylab("Fraction connections to clones") + xlab("Number of clones (nodes)") +
  geom_smooth(method = 'lm', se=TRUE, level=.95, color='red')

ggsave(filename='/Users/eirikhoy/Dropbox/Manuscript_ImmunoSEQ_COMET/figures/connectivity_to_nodes.pdf',
       plot=p,
       width=6, height=5)
## `geom_smooth()` using formula 'y ~ x'
p
## `geom_smooth()` using formula 'y ~ x'

library(tidyverse)
survivial_data <- read_delim("/Users/eirikhoy/Dropbox/projects/airr_comet/data/metadata/survival_data.csv", delim=';')
## New names:
## * n_status -> n_status...13
## * n_status -> n_status...17
## Rows: 85 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (10): Sample, COMET_ID, pathology_id, date_surgery, pcrc_loc, t_status, ...
## dbl (10): age, sex, ASA, bmi, ECOG, tstage_updated, n_status...17, status, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
evenness_AUC <- read_tsv('/Users/eirikhoy/Dropbox/projects/airr_comet/data/evenness_auc.tsv')
## Rows: 92 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): COMET_ID, Sample, group, Run ID, Well Location, Date Posted, QC Sta...
## dbl (9): Evenenss_AUC, ID, COMET_ID_nr, Kit_ID, Kit UID, Gene Rearrangements...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
evenness_AUC <- evenness_AUC %>% dplyr::select(Sample, Evenenss_AUC, AUC_group)
evenness_AUC <- evenness_AUC %>% mutate(Clonality = case_when(AUC_group == 1 ~ 'High',
                                          AUC_group == 2 ~ "Mid",
                                          AUC_group == 3 ~ "Low"))
survivial_data <- survivial_data %>% left_join(evenness_AUC)
## Joining, by = "Sample"
survivial_data$Clonality <- as.factor(survivial_data$Clonality)
attach(survivial_data)
table(status)
## status
##  0  1 
## 44 41
table(Clonality)
## Clonality
## High  Low  Mid 
##   26   16   43

Overall survival

library(survival)
fit.surv <- survfit(Surv(survival, status) ~ 1)
plot(fit.surv, xlab = "Months", 
     ylab = "Estimated Probability of Survival")

Comparison Survival ~ clonality all levels

Clonality <- as.factor(Clonality)
fit.clonal <- survfit(Surv(survival, status) ~ Clonality)
plot(fit.clonal, xlab = "Months",
     ylab = "Estimated Probability of Survival", col = c(2,3,4))
legend("bottomleft", levels(Clonality), col = c(2,3, 4), lty = 1)

logrank.test <- survdiff(Surv(survival, status) ~ Clonality)
logrank.test
## Call:
## survdiff(formula = Surv(survival, status) ~ Clonality)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## Clonality=High 26       11     12.0   0.07939    0.1128
## Clonality=Low  16       10      9.7   0.00906    0.0121
## Clonality=Mid  43       20     19.3   0.02383    0.0453
## 
##  Chisq= 0.1  on 2 degrees of freedom, p= 0.9
fit.cox <- coxph(Surv(survival, status) ~ Clonality)
summary(fit.cox)
## Call:
## coxph(formula = Surv(survival, status) ~ Clonality)
## 
##   n= 85, number of events= 41 
## 
##                coef exp(coef) se(coef)     z Pr(>|z|)
## ClonalityLow 0.1163    1.1233   0.4405 0.264    0.792
## ClonalityMid 0.1198    1.1273   0.3758 0.319    0.750
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## ClonalityLow     1.123     0.8902    0.4737     2.664
## ClonalityMid     1.127     0.8871    0.5397     2.354
## 
## Concordance= 0.528  (se = 0.045 )
## Likelihood ratio test= 0.11  on 2 df,   p=0.9
## Wald test            = 0.11  on 2 df,   p=0.9
## Score (logrank) test = 0.11  on 2 df,   p=0.9
summary(fit.cox)$logtest[1]
##      test 
## 0.1147714
summary(fit.cox)$waldtest[1]
## test 
## 0.11
summary(fit.cox)$sctest[1]
##      test 
## 0.1128919

Comparison Survival ~ Clonality only high versus low

subset <- (Clonality != "Mid")
Clonality <- as.factor(Clonality[])
fit.clonal <- survfit(Surv(survival, status) ~ Clonality, 
                      subset = subset)
plot(fit.clonal, xlab = "Months",
     ylab = "Estimated Probability of Survival", col = c(2,4))
legend("bottomleft", levels(factor(c("High", "Low"), levels=c("High", "Low"))), col = c(2,4), lty = 1)

subset <- (Clonality != "Mid")
logrank.test <- survdiff(Surv(survival, status) ~ Clonality, 
                         subset=subset)
logrank.test
## Call:
## survdiff(formula = Surv(survival, status) ~ Clonality, subset = subset)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## Clonality=High 26       11    11.14   0.00171   0.00378
## Clonality=Low  16       10     9.86   0.00193   0.00378
## 
##  Chisq= 0  on 1 degrees of freedom, p= 1
subset <- (Clonality != "Mid")
fit.cox <- coxph(Surv(survival, status) ~ Clonality,
                 subset=subset)
summary(fit.cox)
## Call:
## coxph(formula = Surv(survival, status) ~ Clonality, subset = subset)
## 
##   n= 42, number of events= 21 
## 
##                 coef exp(coef) se(coef)     z Pr(>|z|)
## ClonalityLow 0.02739   1.02777  0.44531 0.062    0.951
## ClonalityMid      NA        NA  0.00000    NA       NA
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## ClonalityLow     1.028      0.973    0.4294      2.46
## ClonalityMid        NA         NA        NA        NA
## 
## Concordance= 0.478  (se = 0.057 )
## Likelihood ratio test= 0  on 1 df,   p=1
## Wald test            = 0  on 1 df,   p=1
## Score (logrank) test = 0  on 1 df,   p=1
summary(fit.cox)$logtest[1]
##        test 
## 0.003782242
summary(fit.cox)$waldtest[1]
## test 
##    0
summary(fit.cox)$sctest[1]
##        test 
## 0.003784237